'''
Created on 2017年12月20日

@author: yqm
'''
import pandas

# Location of the Titanic training set (CSV) on the local machine.
file_dir = "G:\\研究生\\实验\\语料\\titanic_train.csv"
titanic = pandas.read_csv(file_dir)

# "Age" contains missing entries (NaN); fill them with the column median.
median_age = titanic["Age"].median()
titanic["Age"] = titanic["Age"].fillna(median_age)

# Encode "Sex" as integers in place: male -> 0, female -> 1.
for label, code in (("male", 0), ("female", 1)):
    titanic.loc[titanic["Sex"] == label, "Sex"] = code

# "Embarked" also has missing entries (the original author noted the raw
# values as 'S', 'C', 'Q' and NaN). Fill the gaps with "S", then encode the
# ports as integers in place: S -> 0, C -> 1, Q -> 2.
titanic["Embarked"] = titanic["Embarked"].fillna("S")
for port, code in (("S", 0), ("C", 1), ("Q", 2)):
    titanic.loc[titanic["Embarked"] == port, "Embarked"] = code

from sklearn.linear_model import LinearRegression
# KFold is imported from sklearn.model_selection: the sklearn.cross_validation
# module was deprecated in scikit-learn 0.18 and removed in 0.20.
from sklearn.model_selection import KFold

# Feature columns passed to the model.
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
alg = LinearRegression()

# Select the feature matrix and the label column once, outside the loop.
features = titanic[predictors]
target = titanic["Survived"]

# 3-fold cross-validation: the rows are split into 3 folds; each fold is used
# once as the held-out set while the remaining 2 folds are used for fitting.
# No shuffling is requested, so the folds are consecutive slices of the rows.
# random_state is intentionally not passed: it only has an effect together
# with shuffle=True, and recent scikit-learn raises ValueError otherwise.
kf = KFold(n_splits=3)

# One array of predictions per fold, in fold order.
predictions = []
for train_idx, test_idx in kf.split(features):
    # Fit on the training rows of this fold (features and matching labels).
    train_features = features.iloc[train_idx, :]
    train_target = target.iloc[train_idx]
    alg.fit(train_features, train_target)
    # Predict for the held-out rows of this fold with the freshly fitted model.
    test_prediction = alg.predict(features.iloc[test_idx, :])
    predictions.append(test_prediction)
print(predictions)
    


























































